/*** This do-file processes Current Employment Survey raw data to produce two 
files used for employment benchmarks.
***/

*-------------------------------------------------------------------------------
* Set up
*-------------------------------------------------------------------------------

* Set $root 
project figstabs, root
if (r(buildrunning)==0) include "${root}/code/config_interactive.do"

* Set globals
project, uses("${root}/code/set_globals.do")
include "${root}/code/set_globals.do"
local category "Employment"

* Create required subfolders
cap mkdir "${root}/data/derived/CES"

*-------------------------------------------------------------------------------
**# 1. National monthly employment by NAICS (thousands)
*-------------------------------------------------------------------------------

project, uses("${root}/data/dvc/CES/ce.data.0.AllCESSeries.csv")
import delimited "${root}/data/dvc/CES/ce.data.0.AllCESSeries.csv", clear
drop footnote_codes 

* Get only year 2020
keep if year == 2020

* Get monthly data 
drop if period == "M13" // annual average 

* Split series ID into its component parts 
replace series_id = strtrim(series_id)
assert strlen(series_id) == 13

gen survey_abbreviation = substr(series_id, 1, 2)
gen seasonal_code = substr(series_id, 3, 1)
gen industry_code = substr(series_id, 4, 8)
gen data_type_code = substr(series_id, 12, 2)

* Map industry code to 2-digit NAICS
preserve 
project, uses("${root}/data/dvc/CES/ce.industry.csv")
import delimited "${root}/data/dvc/CES/ce.industry.csv", stringcols(_all) clear 
gisid industry_code 

keep industry_code naics_code 
replace naics_code = strtrim(naics_code)

count if strlen(naics_code) == 2 & inlist(naics_code, "31", "32", "33", "44", "45", "48", "49")
assert r(N) == 0                  // these industries are not represented as 2-digit NAICS in the raw data. So we instead aggregate the 3-digit NAICS.

replace naics_code = "31-33" if inlist(substr(naics_code, 1, 2), "31", "32", "33") & strlen(naics_code) == 3
replace naics_code = "44-45" if inlist(substr(naics_code, 1, 2), "44", "45") & strlen(naics_code) == 3
replace naics_code = "48-49" if inlist(substr(naics_code, 1, 2), "48", "49") & strlen(naics_code) == 3

keep if strlen(naics_code) == 2 | inlist(naics_code, "31-33", "44-45", "48-49")

tempfile naics_mapping
save `naics_mapping'
restore

merge m:1 industry_code using `naics_mapping', assert(1 3) keep(3) nogen

* Get employment (thousands)
keep if data_type_code == "01"
destring value, replace 
rename value employment

* Keep data that is not seasonally adjusted 
keep if seasonal_code == "U"

* Month 
gen month = real(subinstr(period, "M", "", .))

* Collapse employment by NAICS and month
gisid year month industry_code 
gcollapse employment, by(year month naics_code)
rename naics_code naics 

save "${root}/data/derived/CES/national_industry_employment.dta", replace 
project, creates("${root}/data/derived/CES/national_industry_employment.dta")

*-------------------------------------------------------------------------------
**# 2. State-level monthly total employment (thousands)
*-------------------------------------------------------------------------------

project, uses("${root}/data/dvc/CES/sm.data.1.AllData.csv")
import delimited "${root}/data/dvc/CES/sm.data.1.AllData.csv", clear
drop footnote_codes

* Get only years 2020 and 2021
keep if inrange(year, 2020, 2021)

* Get monthly data 
drop if period == "M13" // annual average 

* Split series ID into its component parts 
replace series_id = strtrim(series_id)
assert strlen(series_id) == 20

gen survey_abbreviation = substr(series_id, 1, 2)
gen seasonal_code = substr(series_id, 3, 1)
gen statefips = substr(series_id, 4, 2)
gen area_code = substr(series_id, 6, 5)
gen industry_code = substr(series_id, 11, 8)
gen data_type_code = substr(series_id, 19, 2)

* Get state-level aggregates 
keep if area_code == "00000" & industry_code == "00000000"

* Keep 50 states + DC
drop if inlist(statefips, "00", "72", "78")
destring statefips, replace 
gunique statefips 
assert r(unique) == 51

* Get employment (thousands)
keep if data_type_code == "01"
destring value, replace 
rename value employment

* Keep data that is not seasonally adjusted 
keep if seasonal_code == "U"

* Month 
gen month = real(subinstr(period, "M", "", .))

* Keep relevant variables and save 
gisid year month statefips
keep year month statefips employment

save "${root}/data/derived/CES/state_monthly_total_employment.dta", replace 
project, creates("${root}/data/derived/CES/state_monthly_total_employment.dta")
